In [1]:
# Required packages
import numpy as np 
import pandas as pd 
import pycountry
import pycountry_convert
import re

# Visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from wordcloud import WordCloud
from plotly.subplots import make_subplots
import plotly.express as px

# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

# sns setting
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")

# plt setting
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so use whichever name this installation ships.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

State of Data Science and Machine Learning

In this article, we visualize the data available from the Kaggle survey in three consecutive years (2017, 2018, and 2019). The results include raw numbers about who is working with data, what’s happening with machine learning in different industries, and the best ways for new data scientists to break into the field. We've published the data in as raw a format as possible without compromising anonymization, which makes it an unusual example of a survey dataset.

Loading the data

In [2]:
# The 2018 and 2019 exports are read with header=1, i.e. the column labels come
# from the second row of each file; the 2017 export keeps the default header
# row and is decoded as ISO-8859-1.
Data19 = pd.read_csv(
    'kaggle-survey-2019/multiple_choice_responses.csv',
    header=1,
)
Data18 = pd.read_csv(
    'kaggle-survey-2018/multipleChoiceResponses.csv',
    header=1,
)
Data17 = pd.read_csv(
    'kaggle-survey-2017/multipleChoiceResponses.csv',
    encoding='ISO-8859-1',
)
In [3]:
def Search(Mylist, key):
    """Return the items of ``Mylist`` that contain ``key``, in their original order."""
    matches = []
    for item in Mylist:
        if key in item:
            matches.append(item)
    return matches
def Search_df(df, key):
    """Return the column labels of ``df`` that contain ``key``, in column order."""
    return list(filter(lambda column: key in column, df.columns.tolist()))

Preprocessing

Columns

Renaming Columns

In [4]:
def Rename_func(df):
    """Return ``df`` with the long survey-question column labels replaced by short names.

    The mapping covers both the full question wording and the short
    ``GenderSelect`` / ``CurrentJobTitleSelect`` labels. Labels that are not
    present in ``df`` are simply left out of the rename.
    """
    short_names = {
        'In which country do you currently reside?': 'Country',
        'What is your gender? - Selected Choice': 'Gender',
        'GenderSelect': 'Gender',
        'What is your age (# years)?': 'Age Group',
        'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?': 'FormalEducation',
        'Select the title most similar to your current role (or most recent title if retired): - Selected Choice': 'CurrentJobTitle',
        'CurrentJobTitleSelect': 'CurrentJobTitle',
        'What is your current yearly compensation (approximate $USD)?': 'CurrentSalary',
        'What is the size of the company where you are employed?': 'CompanySize',
        'Approximately how many individuals are responsible for data science workloads at your place of business?': 'DataScienceTeamSize',
    }
    return df.rename(columns=short_names)

# Give all three survey years the same short column names, then drop the helper.
Data19, Data18, Data17 = (Rename_func(_df) for _df in (Data19, Data18, Data17))
del Rename_func

Dropping Columns

In [5]:
# Free-text companion columns of the 2018/2019 surveys that are removed here.
Cols = ['What is your gender? - Prefer to self-describe - Text',
        'Select the title most similar to your current role (or most recent title if retired): - Other - Text',
        'Select any activities that make up an important part of your role at work: (Select all that apply) - Other - Text']
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a strict drop would raise KeyError.
Data18 = Data18.drop(columns = Cols, errors = 'ignore')
Data19 = Data19.drop(columns = Cols, errors = 'ignore')
In [6]:
# Column labels of the 2019 survey, and the subset whose text contains 'Select'.
Columns = Data19.columns.tolist()
Select_Cols = [_label for _label in Columns if 'Select' in _label]

Country

In [7]:
def Countries_func(Col):
    """Return ``Col`` with long or official country names replaced by short common names.

    The "do not wish to disclose" answer is mapped to 'Other'; values that are
    not in the mapping are returned unchanged.
    """
    short_names = {
        'United States of America': 'United States',
        'Viet Nam': 'Vietnam',
        "People 's Republic of China": 'China',
        'Republic of China': 'China',
        "United Kingdom of Great Britain and Northern Ireland": 'United Kingdom',
        "Hong Kong (S.A.R.)": 'Hong Kong',
        'Republic of Korea': 'South Korea',
        'Iran, Islamic Republic of...': 'Iran',
        'I do not wish to disclose my location': 'Other',
    }
    return Col.replace(short_names)


# Normalise the country labels of every survey year, then drop the helper.
for _df in (Data19, Data18, Data17):
    _df['Country'] = Countries_func(_df['Country'])
del _df, Countries_func


# Every answer other than the two main gender options (including a missing
# answer) is collapsed into the single label 'Other'.
Temp = ['Prefer to self-describe', 'Prefer not to say','Non-binary, genderqueer, or gender non-conforming',
        'A different identity', np.nan]
for _df in (Data17, Data18, Data19):
    _df.loc[_df.Gender.isin(Temp), 'Gender'] = 'Other'
del Temp, _df

Continent

In [8]:
def Coutry_Continent(x):
    """Map a country name to its continent name via ``pycountry_convert``.

    The name is converted to an alpha-2 code, then to a continent code, then to
    the continent name. Returns ``np.nan`` when any of these lookups fails
    (for example for 'Other' or a missing value).
    """
    try:
        alpha2 = pycountry_convert.country_name_to_country_alpha2(x, cn_name_format="default")
        continent_code = pycountry_convert.country_alpha2_to_continent_code(alpha2)
        return pycountry_convert.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Best-effort lookup: a failed conversion yields NaN. Catching Exception
        # instead of a bare `except:` lets KeyboardInterrupt and SystemExit
        # still stop a long-running apply.
        return np.nan

# Derive a 'Continent' column from 'Country' for each survey year.
for _df in (Data17, Data18, Data19):
    _df['Continent'] = _df.Country.apply(Coutry_Continent)
del _df

del Coutry_Continent

Age Group

In [9]:
def Age_Group(x):
    """Return the survey age-bracket label for a numeric age ``x``.

    Brackets are '18-21', '22-24', '25-29', ..., '60-69' and '70+'. Ages below
    18 and NaN return ``np.nan``.

    Each bracket is matched by its lower bound only, so a non-integer age such
    as 21.5 lands in '18-21' instead of falling into the gap between two
    closed integer intervals.
    """
    # (inclusive lower bound, label), highest bracket first.
    brackets = (
        (70, '70+'), (60, '60-69'), (55, '55-59'), (50, '50-54'),
        (45, '45-49'), (40, '40-44'), (35, '35-39'), (30, '30-34'),
        (25, '25-29'), (22, '22-24'), (18, '18-21'),
    )
    for lower, label in brackets:
        # NaN compares False against every bound and falls through to NaN.
        if x >= lower:
            return label
    return np.nan
# 2017 is bucketed from its 'Age' column; in 2018 the two oldest brackets are
# merged into a single '70+' bracket.
Data17['Age Group'] = Data17['Age'].apply(Age_Group)
Data18['Age Group'] = Data18['Age Group'].replace(dict.fromkeys(['70-79', '80+'], '70+'))
del Age_Group

Formal Education

In [10]:
def Education_func(Col):
    """Return ``Col`` with the education labels harmonised to one wording.

    Straight-apostrophe degree names are replaced by their curly-apostrophe
    spelling, and the long "did not complete" answer by its short form; any
    other value is returned unchanged.
    """
    harmonised = {
        'I did not complete any formal education past high school': 'No formal education past high school',
        "Bachelor's degree": 'Bachelor’s degree',
        "Master's degree": 'Master’s degree',
        "Some college/university study without earning a bachelor's degree":
            'Some college/university study without earning a bachelor’s degree',
    }
    return Col.replace(harmonised)

# Harmonise the education labels of every survey year.
for _df in (Data17, Data18, Data19):
    _df['FormalEducation'] = Education_func(_df['FormalEducation'])
del _df

Current Salary

In [11]:
# Treat "do not wish to disclose" as a missing salary. The result is assigned
# back to the frame: an in-place replace on `Data18.CurrentSalary` is a chained
# update that does not reach the DataFrame under pandas Copy-on-Write.
Data18['CurrentSalary'] = Data18['CurrentSalary'].replace(
    {'I do not wish to disclose my approximate yearly compensation': np.nan})
Data19['CurrentSalary'] = Data19['CurrentSalary'].replace(
    {'I do not wish to disclose my approximate yearly compensation': np.nan})

Exploratory Data Analysis

Responses by Years

In [12]:
# Number of rows (responses) per survey year. The counts are listed in the same
# order as the years so that each bar carries the right label.
Temp = pd.DataFrame({'Year':[2017, 2018, 2019],
                     'Responses':[Data17.shape[0], Data18.shape[0], Data19.shape[0]]})

fig = px.bar(Temp, y= 'Year', x= 'Responses', orientation='h', text = 'Responses', height= 250)
fig.update_traces(marker_color='lightYellow', marker_line_color='darkRed',
                  marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 25e3])
fig.update_layout(title = 'Number of Responses by Year', plot_bgcolor= 'white')
fig.show()

A quick comparison between the number of responses by year shows that the number of responses in 2018 is the highest.

Responses by Countries

In [13]:
Top = 10
Colors = ['lavender','steelblue','royalblue']

# Per-country response counts, one column per survey year, left-joined onto the
# countries present in 2019.
Temp = None
for _year, _df in (('2019', Data19), ('2018', Data18), ('2017', Data17)):
    Temp0 = _df.groupby(['Country'])['Country'].agg({'count'}).rename(columns = {'count': _year})
    Temp = Temp0 if Temp is None else Temp.join(Temp0)
del Temp0, _df

Temp = Temp.fillna(0).astype(int).reset_index(drop = False)
Temp = Temp.sort_values(by=['2019', '2018', '2017'], ascending=False)

# Share of each country in that year's total, computed before 'Other' is
# removed and before the list is cut to the top entries.
for _year in ('2017', '2018', '2019'):
    Temp[_year + 'Percentage'] = np.round(100*Temp[_year]/Temp[_year].sum(), 2)

Temp = Temp.loc[Temp.Country != 'Other'][:Top]
TopCoutries = Temp.Country.tolist()

fig = go.Figure()
for _year, _color in zip(('2019', '2018', '2017'), Colors):
    fig.add_trace(go.Bar(x= Temp.Country, y= Temp[_year], name='Responses in ' + _year,
                         marker_color= _color, text=Temp[_year + 'Percentage'],
                         textposition='inside'))
del _year, _color

fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig['layout']['yaxis'].update(range=[0, 5e3])
fig.update_layout(title = 'Number of Responses by Country (Top %i)' % Top, plot_bgcolor= 'white')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

It can be seen that each year, the highest number of responses are from India and the United States.

In [14]:
# Long table of (Country, Count, Year), stacked in the order 2019, 2018, 2017.
_parts = []
for _year, _df in ((2019, Data19), (2018, Data18), (2017, Data17)):
    Temp0 = (_df.groupby(['Country'])['Country'].agg({'count'})
                .rename(columns = {'count':'Count'})
                .reset_index(drop = False))
    Temp0['Year'] = _year
    _parts.append(Temp0)
Temp = pd.concat(_parts)
del Temp0, _parts, _year, _df

Temp = Temp.loc[~Temp.Country.isin(['Other'])]
# ISO alpha-3 codes are what the choropleth uses as locations.
Temp['alpha3'] = [pycountry_convert.country_name_to_country_alpha3(_name, cn_name_format="default")
                  for _name in Temp.Country]

fig = px.choropleth(Temp, locations= 'alpha3', color="Count", hover_name="Country",
                    animation_frame="Year", range_color=[0,5e3], color_continuous_scale="Greens")
fig.show()

Responses by Gender

In [15]:
Colors = ['RoyalBlue', 'salmon', 'ForestGreen']
# groupby returns the gender labels in sorted order (Female, Male, Other), so a
# positional colour list would paint Female in the first colour. Colours are
# looked up by label instead, matching the by-country bar charts
# (Male -> RoyalBlue, Female -> salmon, Other -> ForestGreen).
GenderColors = dict(zip(['Male', 'Female', 'Other'], Colors))

fig = make_subplots(rows=1, cols=3, specs=[[{'type':'domain'}, {'type':'domain'},{'type':'domain'}]])

# One donut per survey year, left to right.
for _col, (_year, _df) in enumerate((('2017', Data17), ('2018', Data18), ('2019', Data19)), start=1):
    Temp = _df.groupby(['Gender'])['Gender'].agg({'count'}).rename(columns = {'count':'Count'}).reset_index(drop = False)
    fig.add_trace(go.Pie(labels=Temp.Gender.values,
                         values=Temp.Count.values,
                         name= _year,
                         textfont=dict(size=16),
                         marker=dict(colors = [GenderColors.get(_g, 'lightgray') for _g in Temp.Gender],
                                     line=dict(color='black', width=1))), 1, _col)
del _col, _year, _df

fig.update_traces(hole=.6, marker_line_color='black', marker_line_width=1, opacity=1)

fig.update_layout(title="Gender Distribution", font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='2017', x=0.11, y=0.5, font_size=20, showarrow=False),
                               dict(text='2018', x=0.5, y=0.5, font_size=20, showarrow=False),
                               dict(text='2019', x=0.88, y=0.5, font_size=20, showarrow=False)])
fig.show()

It can be seen that, in each year, the majority of the participants are men. The same distribution can be broken down by country as follows.

In [16]:
# Gender share (in %) of the responses within each country, per survey year.
_parts = []
for _year, _df in ((2019, Data19), (2018, Data18), (2017, Data17)):
    Temp0 = _df.groupby(['Country','Gender'])['Gender'].agg({'count'}).reset_index(drop = False)
    Temp0['Year'] = _year
    # Share of each gender inside its own country. A grouped transform replaces
    # the per-country loop and creates the column as float from the start
    # instead of filling an integer column with floats.
    Temp0['Percentage'] = np.round(100*Temp0['count']/Temp0.groupby('Country')['count'].transform('sum'), 2)
    _parts.append(Temp0)
Temp = pd.concat(_parts)
del Temp0, _parts, _year, _df

# Keep only the top countries selected in the responses-by-country cell.
Temp = Temp.loc[Temp.Country.isin(TopCoutries)]
Temp = Temp.sort_values(by=['Country','Year','Gender']).reset_index(drop = False)

fig = make_subplots(rows=1, cols=3, subplot_titles=('2017', '2018', '2019'))

Colors = ['RoyalBlue', 'salmon', 'ForestGreen']
Name = ['Male', 'Female', 'Other']

# One stacked-bar panel per year; only the first panel contributes legend entries.
for _col, Y in enumerate((2017, 2018, 2019), start=1):
    for i in range(len(Name)):
        Temp0 = Temp.loc[(Temp.Year == Y) & (Temp.Gender == Name[i])]
        fig.add_trace(go.Bar(name = Name[i], x= Temp0.Country, y= Temp0['Percentage'], marker_color= Colors[i],
                             text=Temp0['Percentage'], textposition='inside', showlegend=(_col == 1)),
                      row=1, col=_col)
del _col

fig.update_layout(barmode='relative')

fig.update_traces(marker_line_color='black', marker_line_width= 0.5, opacity=1)
# update_yaxes reaches every subplot; fig['layout']['yaxis'] is only the first one.
fig.update_yaxes(range=[0, 100])
fig.update_layout(title = 'Number of Responses by Country (Top %i)' % Top, plot_bgcolor= 'white')
fig.show()

Responses by Continent

The number and percentage of the participants can be analyzed by continent as well.

In [17]:
C = ['deepskyblue','GreenYellow','OrangeRed', 'violet','LimeGreen','Olive']
fig = make_subplots(rows=1, cols=3, specs=[[{'type':'domain'}, {'type':'domain'},{'type':'domain'}]])

# One donut per survey year; the explicit colour list and outline are given on
# the first trace only.
for _col, (_year, _df) in enumerate((('2017', Data17), ('2018', Data18), ('2019', Data19)), start=1):
    Temp = (_df.groupby(['Continent'])['Continent'].agg({'count'})
               .rename(columns = {'count':'Count'})
               .reset_index(drop = False)
               .sort_values(by=['Continent']))
    _pie = dict(labels=Temp.Continent.values, values=Temp.Count.values,
                name= _year, textfont=dict(size=16))
    if _col == 1:
        _pie['marker'] = dict(colors = C, line=dict(color='black', width=1))
    fig.add_trace(go.Pie(**_pie), 1, _col)
del _col, _year, _df, _pie

fig.update_traces(hole=.6, marker_line_color='black', marker_line_width=1, opacity=1)

# Year captions placed at the centre of each donut.
fig.update_layout(title="Responses by Continent", font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text=_label, x=_x, y=0.5, font_size=20, showarrow=False)
                               for _label, _x in zip(('2017', '2018', '2019'), (0.11, 0.5, 0.88))])
fig.show()

# Gender share (in %) of the responses within each continent, per survey year.
_parts = []
for _year, _df in ((2017, Data17), (2018, Data18), (2019, Data19)):
    Temp0 = _df.groupby(['Continent','Gender'])['Continent'].agg({'count'}).reset_index(drop = False)
    # A grouped transform gives each row its continent total, so the share is a
    # float column from the start (no integer column filled with floats).
    Temp0['Percentage'] = np.round(100*(Temp0['count']/Temp0.groupby('Continent')['count'].transform('sum')), 2)
    Temp0['Year'] = _year
    _parts.append(Temp0)
Temp = pd.concat(_parts)
del Temp0, _parts, _year, _df

Temp = Temp.sort_values(by=['Continent','Year','Gender']).reset_index(drop = False)

fig = make_subplots(rows=1, cols=3, subplot_titles=('2017', '2018', '2019'))

Colors = ['RoyalBlue', 'salmon', 'ForestGreen']
Name = ['Male', 'Female', 'Other']

# One stacked-bar panel per year; only the first panel contributes legend entries.
for _col, Y in enumerate((2017, 2018, 2019), start=1):
    for i in range(len(Name)):
        Temp0 = Temp.loc[(Temp.Year == Y) & (Temp.Gender == Name[i])]
        fig.add_trace(go.Bar(name = Name[i], x= Temp0.Continent, y= Temp0['Percentage'], marker_color= Colors[i],
                             text=Temp0['Percentage'], textposition='inside', showlegend=(_col == 1)),
                      row=1, col=_col)
del _col

fig.update_layout(barmode='relative')

fig.update_traces(marker_line_color='black', marker_line_width= 0.5, opacity=1)
# update_yaxes reaches every subplot; fig['layout']['yaxis'] is only the first one.
fig.update_yaxes(range=[0, 100])
fig.update_layout(title = 'Number of Responses by Continent', plot_bgcolor= 'white')
fig.show()
In [18]:
C = ['#9b59b6', '#e74c3c', '#34495e']

# (continent, upper y-limit, extra layout options) for each of the three figures.
_panels = (
    ('North America', 5e3, dict(width=600)),
    ('Europe', 1e3, dict(legend=dict(orientation="h", x=0.4, y=1.2))),
    ('Asia', 5e3, dict(legend=dict(orientation="h", x=0.4, y=1.2))),
)

for _continent, _ymax, _extra in _panels:
    fig = go.Figure()
    # One bar series per survey year, counting responses per country of the continent.
    for (_year, _df), _color in zip(((2017, Data17), (2018, Data18), (2019, Data19)), C):
        Temp = (_df.loc[_df.Continent == _continent]
                   .groupby(['Country'])['Country'].agg({'count'})
                   .reset_index(drop = False))
        fig.add_trace(go.Bar(x= Temp.Country, y= Temp['count'],
                             name='Responses in %i' % _year, marker_color= _color))
    fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
    fig['layout']['yaxis'].update(range=[0, _ymax])
    fig.update_layout(title = 'The Number of Responses (%s)' % _continent, plot_bgcolor= 'white', **_extra)
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.show()
del _panels, _continent, _ymax, _extra, _year, _df, _color

Responses by Age Groups

In [19]:
C = ['#9b59b6', '#e74c3c', '#34495e']

fig = go.Figure()

# One bar series per survey year, counting responses per age group.
for (_year, _df), _color in zip(((2017, Data17), (2018, Data18), (2019, Data19)), C):
    Temp = _df.groupby(['Age Group'])['Age Group'].agg({'count'}).reset_index(drop = False)
    fig.add_trace(go.Bar(x= Temp['Age Group'], y= Temp['count'],
                         name='Responses in %i' % _year, marker_color= _color))
del _year, _df, _color

fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig['layout']['yaxis'].update(range=[0, 7e3])
fig.update_layout(title = 'The Number of Responses', plot_bgcolor= 'white', legend=dict(orientation="h", x=0.4, y=1.2))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Responses by Age Categories

| Age Categories | Age Groups |
|---|---|
| Youth | 18-24 years |
| Adults | 25-59 years |
| Seniors | 60+ years |
In [20]:
Top = 20
# The Top countries with the most 2019 responses.
Temp0 = Data19.groupby(['Country'])['Country'].agg({'count'}).sort_values('count',ascending=False).index.tolist()[:Top]
# Work on a copy so that adding a column does not write into a slice of Data19.
Temp = Data19[['Age Group','Country']].copy()
Temp['Age Categories'] = Temp['Age Group'].replace({'18-21': 'Youth', '22-24': 'Youth','25-29':'Adults', '30-34':'Adults',
                                                    '35-39':'Adults', '40-44':'Adults', '45-49':'Adults','50-54':'Adults',
                                                    '55-59':'Adults', '60-69': 'Seniors', '70+': 'Seniors'})
Temp = Temp.groupby(['Country','Age Categories'])['Age Categories'].agg({'count'}).reset_index(drop = False)
Temp = Temp.loc[Temp.Country.isin(Temp0)].copy()
# Display order Youth < Adults < Seniors. Each category needs its own key;
# giving 'Seniors' the same key as 'Adults' leaves their relative order to chance.
Temp['Sort'] = Temp['Age Categories'].map({'Youth': 1, 'Adults': 2, 'Seniors': 3}).fillna(0).astype(int)
Temp = Temp.sort_values(['Country','Sort'])
del Temp0

fig = px.bar(Temp, x= 'Country', y= 'count', barmode='group', color = 'Age Categories')
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig['layout']['yaxis'].update(range=[0, 3e3])
fig.update_layout(title = 'The Number of Responses', plot_bgcolor= 'white', legend=dict(orientation="h", x=0.55, y=1.1))
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Responses by Education

In [21]:
# Responses per education level, stacked for the three survey years.
_parts = []
for _year, _df in ((2019, Data19), (2018, Data18), (2017, Data17)):
    Temp0 = _df.groupby(['FormalEducation'])['FormalEducation'].agg({'count'}).reset_index(drop = False)
    Temp0['Year'] = _year
    _parts.append(Temp0)
Temp = pd.concat(_parts).sort_values(['FormalEducation','Year'])
del Temp0, _parts, _year, _df
# Only 'Year' is turned into text, so that it is coloured as a discrete
# category. Casting the whole frame would also turn the counts into strings.
Temp['Year'] = Temp['Year'].astype(str)
fig = px.bar(Temp, y= 'FormalEducation', x= 'count', barmode='group', color = 'Year', orientation='h')
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig['layout']['xaxis'].update(range=[0, 12e3])
fig.update_layout(title = 'The Number of Responses', plot_bgcolor= 'white', legend=dict(orientation="h", x=0.45, y=1.1))
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
In [22]:
Top = 5

# For each survey year: education mix (in %) of the Top countries by response count.
for _year, _df in ((2017, Data17), (2018, Data18), (2019, Data19)):
    Temp0 = _df.groupby(['Country'])['Country'].agg({'count'}).sort_values('count',ascending=False).index.tolist()
    # Filtering instead of list.remove() avoids a ValueError when a year has no
    # 'Other' country entry.
    Temp0 = [_name for _name in Temp0 if _name != 'Other'][:Top]
    Temp = _df.groupby(['Country','FormalEducation'])['FormalEducation'].agg({'count'}).reset_index(drop = False)
    Temp = Temp.loc[Temp.Country.isin(Temp0)].copy()
    # Share of each education level within its own country.
    Temp['Percentage'] = np.round(100*Temp['count']/Temp.groupby('Country')['count'].transform('sum'), 2)

    fig = px.bar(Temp, y="Country", x="Percentage", color='FormalEducation', orientation='h',
                 hover_data=["Country", "count"],
                 height=400, title='Responses in %i' % _year)
    fig['layout']['xaxis'].update(range=[0, 100])
    fig.update_layout(plot_bgcolor= 'white')
    fig.update_traces(marker_line_color='black', marker_line_width=0.5, opacity=1)
    fig.show()
del Temp0, _year, _df

Current Job Title

In [23]:
# Word cloud built from all non-missing 2019 job titles joined into one text.
_titles = " ".join(Data19['CurrentJobTitle'].dropna())
_cloud = WordCloud(background_color='white').generate(_titles)

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
ax.imshow(_cloud, interpolation='bilinear')
del _titles, _cloud
ax.axis('off')
ax.set_title('Job Titles in 2019',fontsize=20);
In [24]:
# Responses per job title for 2019 and 2018.
# NOTE(review): 2017 is left out of this comparison — presumably its job-title
# categories do not line up with the later surveys; confirm before adding Data17.
_parts = []
for _year, _df in ((2019, Data19), (2018, Data18)):
    Temp0 = _df.groupby(['CurrentJobTitle'])['CurrentJobTitle'].agg({'count'}).reset_index(drop = False)
    Temp0['Year'] = _year
    _parts.append(Temp0)
Temp = pd.concat(_parts).sort_values(['CurrentJobTitle','Year'])
del Temp0, _parts, _year, _df
# Only 'Year' is turned into text, so that it is coloured as a discrete
# category. Casting the whole frame would also turn the counts into strings.
Temp['Year'] = Temp['Year'].astype(str)
fig = px.bar(Temp, y= 'CurrentJobTitle', x= 'count', barmode='group', color = 'Year', orientation='h',height=800)
fig.update_traces(marker_line_color='black', marker_line_width=1, opacity=1)
fig['layout']['xaxis'].update(range=[0, 6e3])
fig.update_layout(xaxis_title="Count", yaxis_title="Current Job Title")
fig.update_layout(title = 'The Number of Responses', plot_bgcolor= 'white', legend=dict(orientation="h", x=0.75, y=1.1))
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Current Salary Range

In [25]:
# ---------------------------------------------------------------------------
# 2018: derive the lower bound (in dollars) of every salary bracket.
# Range labels look like '300-400,000' (lower bound written in thousands);
# the open-ended top bracket looks like '500,000+' (already in full dollars).
# ---------------------------------------------------------------------------
Temp = Data18['CurrentSalary'].str.split(pat = "-", expand=True)
Temp.columns = ['SalaryMin','SalaryMax']
OpenEnded = Data18['CurrentSalary'].str.contains('+', regex=False, na=False)
# regex=False: '+' and '$' are regex metacharacters, treat them as plain text.
Temp['SalaryMin'] = Temp['SalaryMin'].str.replace('+', '', regex=False).str.replace(',', '', regex=False)
Temp['SalaryMin'] = Temp['SalaryMin'].fillna(0).astype(int)
# Scale only the range labels; scaling '500,000+' as well would give 500,000,000.
Temp['SalaryMin'] = Temp['SalaryMin'].where(OpenEnded, Temp['SalaryMin'] * 1000)
Temp['SalaryMax'] = Temp['SalaryMax'].str.replace(',', '', regex=False)
Temp18 = pd.concat([Data18['CurrentSalary'], Temp], axis=1)
# 2019 has a single 300,000-500,000 bracket, so merge the two 2018 brackets to match.
Temp18.loc[Temp18.CurrentSalary.isin(['300-400,000','400-500,000']), ['CurrentSalary','SalaryMin']] = '300-500,000', int(3e5)
Group18 = Temp18.groupby(['CurrentSalary','SalaryMin'])['SalaryMin'].agg(['count']).reset_index()\
                                                                .sort_values(['SalaryMin']).reset_index(drop = True)
del Temp

# ---------------------------------------------------------------------------
# 2019: same parsing; these labels carry '$', '> ' and are written in full dollars.
# ---------------------------------------------------------------------------
Temp = Data19['CurrentSalary'].str.split(pat = "-", expand=True)
Temp.columns = ['SalaryMin','SalaryMax']
Temp['SalaryMin'] = Temp['SalaryMin'].str.replace('$', '', regex=False).str.replace('> ', '', regex=False)\
                                     .str.replace(',', '', regex=False)
Temp['SalaryMin'] = Temp['SalaryMin'].fillna(0).astype(int)
Temp['SalaryMax'] = Temp['SalaryMax'].str.replace(',', '', regex=False)
Temp19 = pd.concat([Data19['CurrentSalary'], Temp], axis=1)
Temp19.loc[Temp19.CurrentSalary == '$0-999', 'CurrentSalary'] ='0-999'
Temp19.loc[Temp19.CurrentSalary == '> $500,000', 'CurrentSalary'] ='500,000+'
# Group19 keeps the native 2019 brackets; it feeds the 2019-only chart below.
Group19 = Temp19.groupby(['CurrentSalary','SalaryMin'])['SalaryMin'].agg(['count']).reset_index()\
                                                                 .sort_values(['SalaryMin']).reset_index(drop = True)
del Temp

# ---------------------------------------------------------------------------
# Re-label every 2019 bracket with the 2018 bracket that contains its lower bound.
# Walk the 2018 lower bounds in ascending order: each 2019 bracket that starts
# below the next bound (and is not yet assigned) gets the label of the last 2018
# bracket starting below that bound.
# ---------------------------------------------------------------------------
Temp = []  # 2019 labels that have already been re-labelled
for s in Group18.SalaryMin.unique()[1:]:
    Temp0 = Group19.loc[Group19.SalaryMin < s,'CurrentSalary'].tolist()
    Temp0 = list(set(Temp0) - set(Temp))
    Temp19.loc[Temp19.CurrentSalary.isin(Temp0), 'CurrentSalary'] = Group18.loc[Group18.SalaryMin < s,
                                                                                'CurrentSalary'].tolist()[-1]
    Temp.extend(Temp0)

# The open-ended top bracket always keeps its own label.
Temp19.loc[Temp19.SalaryMin>= 5e5, 'CurrentSalary'] ='500,000+'
del Temp0, Group18

# Side-by-side counts per harmonised bracket, in long format for plotting.
Group = Temp18.groupby(['CurrentSalary','SalaryMin'])['CurrentSalary'].agg(['count']).reset_index()\
                        .sort_values(['SalaryMin']).reset_index(drop = True)
Group = Group[['CurrentSalary','count']].merge(Temp19.groupby(['CurrentSalary'])['CurrentSalary']\
                                               .agg(['count']).reset_index(drop = False),
                                               left_on='CurrentSalary', right_on='CurrentSalary')
Group.columns = ['CurrentSalary', '2018', '2019']
Group = Group.melt(id_vars=['CurrentSalary'], value_vars=['2018','2019'], var_name='Year', value_name='Count')


# Chart 1: 2019 responses in the native 2019 brackets.
fig = px.bar(Group19, x= 'CurrentSalary', y= 'count', text = 'count')
fig.update_traces(marker_color='skyblue', marker_line_color='navy', marker_line_width=1, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = 'The Number of Responses (2019)', yaxis_title="Count", xaxis_title="Current Salary",
                  uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(range=[0, 1600], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

# Chart 2: 2018 vs 2019 on the harmonised (2018) brackets.
fig = px.bar(Group, x= 'CurrentSalary', y= 'Count', color='Year', text = 'Count', barmode='group')
fig.update_traces(marker_line_color='navy', marker_line_width=1, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = 'The Number of Responses', yaxis_title="Count", xaxis_title="Current Salary",
                  uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor= 'white',
                  legend=dict(orientation="h", x=0.75, y=1.1))
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(range=[0, 5e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
In [26]:
Cols = ['CurrentSalary', 'SalaryMin', 'SalaryMax']

# Coarse salary buckets: (new label, detailed brackets it absorbs, lower bound, upper bound).
# Brackets not listed here (e.g. the lowest one) keep their original label.
SalaryBuckets = [
    ('10-50,000',   ['10-20,000', '20-30,000', '30-40,000', '40-50,000'],               int(1e4), int(5e4)),
    ('50-100,000',  ['50-60,000', '60-70,000', '70-80,000', '80-90,000', '90-100,000'], int(5e4), int(1e5)),
    ('100-200,000', ['100-125,000', '125-150,000', '150-200,000'],                      int(1e5), int(2e5)),
    ('200,000+',    ['200-250,000', '250-300,000', '300-500,000', '500,000+'],          int(2e5), np.nan),
]
# Display order of the coarse buckets; any other label sorts first (0).
SalaryOrder = {'10-50,000': 1, '50-100,000': 2, '100-200,000': 3, '200,000+': 4}


def _bucket_salaries(df):
    """Collapse the detailed salary brackets of `df` (in place) into SalaryBuckets."""
    for label, members, low, high in SalaryBuckets:
        df.loc[df.CurrentSalary.isin(members), Cols] = label, low, high


def _count_by_title_salary_country(df):
    """Return respondent counts per (job title, salary bucket, country), sorted for plotting."""
    counts = df.groupby(['CurrentJobTitle','CurrentSalary','Country'])['CurrentSalary']\
               .agg(['count']).reset_index()
    counts['Sort'] = counts.CurrentSalary.map(SalaryOrder).fillna(0).astype(int)
    return counts.sort_values(['CurrentJobTitle','Country','Sort'])


def _plot_salary_by_title(counts, country, year, y_max):
    """Show grouped bars of salary-bucket counts per job title for one country and year.

    `y_max` is the upper limit of the count axis. Returns the plotly figure.
    """
    fig = px.bar(counts.loc[counts.Country == country],
                 x= 'CurrentJobTitle', y= 'count', color='CurrentSalary', text = 'count', barmode='group')
    fig.update_traces(marker_line_color='navy', marker_line_width=1, opacity=1,
                      texttemplate='%{text:.2s}', textposition='outside')
    # Job titles are on x and counts on y, so the axis titles must follow that.
    fig.update_layout(title = 'Salaries and Job Titles in %s (%s)' % (year, country),
                      xaxis_title="Current Job Title", yaxis_title="Count",
                      uniformtext_minsize=8, uniformtext_mode='hide', plot_bgcolor= 'white')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.update_yaxes(range=[0, y_max], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
    fig.show()
    return fig


# Attach job title, country and education to the per-respondent salary frames.
# Selecting only the salary columns keeps this concat from duplicating columns on a re-run.
Temp18 = pd.concat([Data18[['CurrentJobTitle','Country','FormalEducation']], Temp18[Cols]], axis=1)
Temp19 = pd.concat([Data19[['CurrentJobTitle','Country','FormalEducation']], Temp19[Cols]], axis=1)
_bucket_salaries(Temp18)
_bucket_salaries(Temp19)

# One chart per (year, country); the count-axis limit is tuned per chart.
for _year, _frame, _panels in ((2018, Temp18, (('United States', 500), ('Canada', 70))),
                               (2019, Temp19, (('United States', 500), ('Canada', 60)))):
    Temp = _count_by_title_salary_country(_frame)
    for Country, _y_max in _panels:
        fig = _plot_salary_by_title(Temp, Country, _year, _y_max)

Activities

In [27]:
def mysplit(Text, S):
    """Return the part of `Text` that follows the separator `S`.

    `S` must occur exactly once in `Text`; otherwise the two-way unpacking
    below raises ValueError.
    """
    Pieces = Text.split(S)
    (_, Out) = Pieces
    return Out

def mysplit2(Text):
    """Return `Text` up to (not including) the first ' (' — i.e. drop a trailing parenthetical.

    If ' (' does not occur, `Text` is returned unchanged.
    """
    return Text.partition(' (')[0]
In [28]:
S = 'Select any activities that make up an important part of your role at work: (Select all that apply) - Selected Choice - '

# 2018: keep the option columns of this question, label them by option text,
# then count the non-null answers per option (one row per option).
Temp18 = Data18[Search_df(Data18,S)].rename(columns = lambda name: mysplit(name, S))
Temp18 = Temp18.agg(['count']).T.reset_index()
Temp18.columns = ['Activities', 'Count']
Temp18['Year'] = '2018'

# 2019: same question, same treatment.
Temp19 = Data19[Search_df(Data19,S)].rename(columns = lambda name: mysplit(name, S))
Temp19 = Temp19.agg(['count']).T.reset_index()
Temp19.columns = ['Activities', 'Count']
Temp19['Year'] = '2019'

Temp = pd.concat([Temp18,Temp19])
del Temp18, Temp19

# Horizontal bars, one pair (2018/2019) per activity.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6), sharey = True)
_ = sns.barplot(data=Temp, y= 'Activities', x= 'Count', hue='Year', edgecolor='k',  hatch="///", ax = ax)
_ = ax.set_yticklabels(ax.get_yticklabels(), fontsize = 12)
_ = ax.set_xlim([0,1e4])
_ = ax.legend(bbox_to_anchor=(0.7, 0.1), fontsize = 14)

Media Sources

In [29]:
S = 'Who/what are your favorite media sources that report on data science topics? (Select all that apply) - Selected Choice - '
Col = 'Media Sources'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# Drop the parenthetical examples from each option label, then sort alphabetically.
Temp[Col] = Temp[Col].map(mysplit2)
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=500)
fig.update_traces(marker_color='lightskyblue', marker_line_color='navy', marker_line_width=1, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 12e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Data Science Courses

In [30]:
S = 'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - '
Col = 'Data Science Courses'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# Drop the parenthetical examples from each option label, then sort alphabetically.
Temp[Col] = Temp[Col].map(mysplit2)
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=500)
fig.update_traces(marker_color='orchid', marker_line_color='indigo', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 10e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Integrated Development Environments

In [31]:
S = """Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all that apply) - Selected Choice - """
Col = """IDE's"""

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=500)
fig.update_traces(marker_color='limegreen', marker_line_color='darkgreen', marker_line_width=2, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 12e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Notebook Host

In [32]:
S = 'Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selected Choice -  '
Col = 'Notebook Host'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=400)
fig.update_traces(marker_color='bisque', marker_line_color='darkorange', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 6e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Programming Languages

In [33]:
S = 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - '
Col = 'Programming Languages'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=500)
fig.update_traces(marker_color='lightcoral', marker_line_color='darkred', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 14e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Visualization Libraries

In [34]:
S = 'What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected Choice -'
Col = 'Visualization Libraries'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

# This chart keeps plotly's default background and axes: the white-background / grid
# styling applied to the other charts is deliberately not applied here.
fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=500)
fig.update_traces(marker_color='whitesmoke', marker_line_color='dimgray', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 12e3])
fig.show()

Specialized Hardwares

In [35]:
S = 'Which types of specialized hardware do you use on a regular basis?  (Select all that apply) - Selected Choice -'
Col = 'Specialized Hardwares'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=300)
fig.update_traces(marker_color='pink', marker_line_color='mediumvioletred', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 12e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

ML Algorithms

In [36]:
S = 'Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice -'
Col = 'ML Algorithms'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=500)
fig.update_traces(marker_color='mediumpurple', marker_line_color='darkred', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 12e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

ML Tools

In [37]:
S = 'Which categories of ML tools do you use on a regular basis?  (Select all that apply) - Selected Choice - '
Col = 'ML Tools'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=450)
fig.update_traces(marker_color='lightgreen', marker_line_color='darkolivegreen', marker_line_width=2, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 10e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Computer Vision Methods

In [38]:
S = 'Which categories of computer vision methods do you use on a regular basis?  (Select all that apply) - Selected Choice - '
Col = 'Computer Vision Methods'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

# Show the table with the full option labels and without the row index.
# Newer pandas replaced Styler.hide_index() with Styler.hide(axis='index');
# use whichever this pandas version provides.
Styled = Temp.style
Styled = Styled.hide(axis='index') if hasattr(Styled, 'hide') else Styled.hide_index()
display(Styled)

# For the chart, shorten each label by dropping its parenthetical examples.
Temp[Col] = Temp[Col].map(mysplit2).str.strip()
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(11, 5), sharey = True)
_ = sns.barplot(ax = ax, y= Col, x= 'Count', palette = 'summer', edgecolor='k',  hatch="///", data=Temp)
_ = ax.set_yticklabels(ax.get_yticklabels(), fontsize = 12)
_ = ax.set_xlim([0,3.5e3])
Computer Vision Methods Count
General purpose image/video tools (PIL, cv2, skimage, etc) 2207
Generative Networks (GAN, VAE, etc) 1081
Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc) 3187
Image segmentation methods (U-Net, Mask R-CNN, etc) 2061
None 1203
Object detection methods (YOLOv3, RetinaNet, etc) 1872
Other 51

Natural Language Processing (NLP)

In [39]:
S = 'Which of the following natural language processing (NLP) methods do you use on a regular basis?  (Select all that apply) - Selected Choice - '
Col = 'Natural Language Processing (NLP)'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# Trim surrounding whitespace from the option labels, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height=400)
fig.update_traces(marker_color='cornsilk', marker_line_color='darkgoldenrod', marker_line_width=2, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
# The data comes from Data19, so the title carries the same "(2019)" suffix as
# every other single-year chart in this notebook.
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 2.5e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Machine Learning Frameworks

In [40]:
S = 'Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Machine Learning Frameworks'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height= 500)
fig.update_traces(marker_color='lightsalmon', marker_line_color='darkred', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 10e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Cloud Computing Platforms

In [41]:
S = 'Which of the following cloud computing platforms do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Cloud Computing Platforms'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height= 500)
fig.update_traces(marker_color='yellowgreen', marker_line_color='darkgreen', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 3e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Cloud Computing Products

In [42]:
S = 'Which specific cloud computing products do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Cloud Computing Products'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

# White bars with a grey outline, drawn on plotly's default background and axes.
fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height= 500)
fig.update_traces(marker_color='white', marker_line_color='dimgray', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_xaxes(range=[0, 3.5e3])
fig.show()

Big Data / Analytics Products

In [43]:
S = 'Which specific big data / analytics products do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Big Data / Analytics Products'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height= 500)
fig.update_traces(marker_color='orchid', marker_line_color='royalblue', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 4.5e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Machine Learning Products

In [44]:
S = 'Which of the following machine learning products do you use on a regular basis? (Select all that apply) - Selected Choice -'
Col = 'Machine Learning Products'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height= 500)
fig.update_traces(marker_color='tomato', marker_line_color='dimgray', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 5e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Automated Machine Learning Tools

In [45]:
S = 'Which automated machine learning tools (or partial AutoML tools) do you use on a regular basis?  (Select all that apply) - Selected Choice -'
Col = 'Automated Machine Learning Tools'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height= 500)
fig.update_traces(marker_color='royalblue', marker_line_color='navy', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 6e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()

Relational Database Products

In [46]:
S = 'Which of the following relational database products do you use on a regular basis? (Select all that apply) - Selected Choice -'
# The question is about relational databases, so the axis label and chart title
# must say so (they previously repeated the AutoML section's label).
Col = 'Relational Database Products'

# Non-null answers per option column of this question, one row per option.
Temp = (Data19[Search_df(Data19,S)]
        .rename(columns = lambda name: mysplit(name, S))
        .agg(['count']).T.reset_index())
Temp.columns = [Col, 'Count']
# S has no trailing space, so the option labels start with one: trim, then sort alphabetically.
Temp[Col] = Temp[Col].str.strip()
Temp = Temp.sort_values(by=[Col])

fig = px.bar(Temp, y= Col, x= 'Count', orientation='h', text = 'Count', barmode='group', height= 500)
fig.update_traces(marker_color='darkslategray', marker_line_color='black', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = '%s (2019)' % Col, uniformtext_minsize=8, uniformtext_mode='hide',
                  plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 4e3], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()